import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from time import time
import warnings
warnings.filterwarnings('ignore')
# Augmented Dickey-Fuller Test - Checking Stationarity
def perform_adf_test(series):
    """Run an Augmented Dickey-Fuller stationarity test on *series*.

    Prints the ADF statistic and p-value (a p-value below ~0.05 suggests
    the series is stationary) and also returns them so callers can use
    the result programmatically instead of reading stdout.

    Parameters
    ----------
    series : array-like / pd.Series
        The time series to test.

    Returns
    -------
    tuple (float, float)
        (ADF statistic, p-value).
    """
    result = adfuller(series)
    print('ADF Statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    return result[0], result[1]
#data = pd.read_csv('C:/Users/migue/Desktop/data_preprocess.csv')
#data.head()
#data.info()
#---------------------------- CREATING Point-of-Sale df -----------
# Sales grouped by POS and week
#df=data.loc[:,['Date','Point-of-Sale_ID','Value','Units']].copy()
#df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
#df['week'] = df['Date'] - pd.to_timedelta(7, unit='d')
#df.sort_values(by='Date')
#vdf = df.groupby(['Point-of-Sale_ID', pd.Grouper(key='week', freq='W-MON')])['Units'].sum().reset_index().sort_values('week')
#vdf.to_csv('C:/Users/migue/Desktop/unit_df.csv')
#---------------------------- CREATING Product df -----------
# Sales grouped by Product and week
#df_prod = data.loc[:,['Date','ProductName_ID','Units']].copy()
#df_prod['Date'] = pd.to_datetime(df_prod['Date'], format='%Y-%m-%d')
#df_prod['week'] = df_prod['Date'] - pd.to_timedelta(7, unit='d')
#df_prod.sort_values(by='Date')
#productdf = df_prod.groupby(['ProductName_ID', pd.Grouper(key='week', freq='W-MON')])['Units'].sum()#.reset_index().sort_values('week')
#productdf.to_csv('C:/Users/migue/Desktop/product_df.csv')
#---------------------------- CREATING Product & POS df -----------
#df=data.loc[:,['Date','Point-of-Sale_ID','ProductName_ID','Units']].copy()
#df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
#df['week'] = df['Date'] - pd.to_timedelta(7, unit='d')
#df['id']=df['Point-of-Sale_ID'].astype(str)+'-'+df['ProductName_ID'].astype(str)
#df = df.groupby(['id', pd.Grouper(key='week', freq='W-MON')])['Units'].sum().reset_index().sort_values('week')
#columns = df['id'].str.split(r"-", expand=True)
#finaldf = pd.concat([df,columns],axis=1)
#finaldf.rename(columns={0: 'Point-of-Sale_ID', 1: 'ProductName_ID'},inplace=True)
#finaldf = finaldf[['week', 'Units', 'Point-of-Sale_ID', 'ProductName_ID']].copy()
#finaldf['Point-of-Sale_ID']=finaldf['Point-of-Sale_ID'].astype(int)
#finaldf['ProductName_ID']=finaldf['ProductName_ID'].astype(int)
#finaldf.to_csv('C:/Users/migue/Desktop/forecast_df.csv',index=False)
#---------------------------- Load Product & POS df -----------
# Pre-aggregated weekly units per (Point-of-Sale_ID, ProductName_ID) pair.
fdf = pd.read_csv('C:/Users/migue/Desktop/Datasets/forecast_df.csv')
fdf['week'] = pd.to_datetime(fdf['week'], format='%Y-%m-%d')  # parse week-start dates
fdf.head(3)
| week | Units | Point-of-Sale_ID | ProductName_ID | |
|---|---|---|---|---|
| 0 | 2015-12-28 | 1.0 | 138 | 53 |
| 1 | 2015-12-28 | 2.0 | 271 | 1366 |
| 2 | 2015-12-28 | 3.0 | 271 | 1369 |
fdf.info(3)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 29589264 entries, 0 to 29589263 Data columns (total 4 columns): # Column Dtype --- ------ ----- 0 week datetime64[ns] 1 Units float64 2 Point-of-Sale_ID int64 3 ProductName_ID int64 dtypes: datetime64[ns](1), float64(1), int64(2) memory usage: 903.0 MB
#---------------------------- CREATING Product & POS df only for DASH -----------
#fdf.groupby('ProductName_ID').sum().sort_values('Units').tail(20).index
#fdf.groupby('Point-of-Sale_ID').sum().sort_values('Units').tail(20).index
#forecast_pos_pid_dash = fdf[(fdf['Point-of-Sale_ID'].isin([362,410,252,347,360,42,37,356,62,72,359,48,282,383,103, 92,272,280,78,292]))&(fdf['ProductName_ID'].isin([207,130,356,55,21,912,738,1408,1846,567,1422,226,847,2802,1234,1147,993,481,2609,1277]))]
#forecast_pos_pid_dash.to_csv('C:/Users/migue/Desktop/forecast_pos_pid_dash.csv',index=False)
#---------------------------- Load Product Df -----------
vdf = pd.read_csv('C:/Users/migue/Desktop/Datasets/unit_df.csv')
vdf['week'] = pd.to_datetime(vdf['week'], format='%Y-%m-%d')
vdf.head(3)
| Unnamed: 0 | Point-of-Sale_ID | week | Units | |
|---|---|---|---|---|
| 0 | 0 | 1 | 2015-12-28 | 1027.0 |
| 1 | 55043 | 275 | 2015-12-28 | 915.0 |
| 2 | 54842 | 274 | 2015-12-28 | 735.0 |
vdf.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 82379 entries, 0 to 82378 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 82379 non-null int64 1 Point-of-Sale_ID 82379 non-null int64 2 week 82379 non-null datetime64[ns] 3 Units 82379 non-null float64 dtypes: datetime64[ns](1), float64(1), int64(2) memory usage: 2.5 MB
vdf[vdf['Point-of-Sale_ID']==1].head(3)
| Unnamed: 0 | Point-of-Sale_ID | week | Units | |
|---|---|---|---|---|
| 0 | 0 | 1 | 2015-12-28 | 1027.0 |
| 799 | 1 | 1 | 2016-01-04 | 3561.0 |
| 1192 | 2 | 1 | 2016-01-11 | 3108.0 |
#---------------------------- Load Product Df -----------
pdf = pd.read_csv('C:/Users/migue/Desktop/Datasets/product_df.csv')
pdf['week'] = pd.to_datetime(pdf['week'], format='%Y-%m-%d')
pdf.head(3)
| Unnamed: 0 | ProductName_ID | week | Units | |
|---|---|---|---|---|
| 0 | 150840 | 1425 | 2015-12-28 | 114.0 |
| 1 | 156252 | 1482 | 2015-12-28 | 46.0 |
| 2 | 64247 | 631 | 2015-12-28 | 19.0 |
pdf.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 308494 entries, 0 to 308493 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 308494 non-null int64 1 ProductName_ID 308494 non-null int64 2 week 308494 non-null datetime64[ns] 3 Units 308494 non-null float64 dtypes: datetime64[ns](1), float64(1), int64(2) memory usage: 9.4 MB
pdf[pdf['ProductName_ID']==1].head(3)
| Unnamed: 0 | ProductName_ID | week | Units | |
|---|---|---|---|---|
| 271799 | 0 | 1 | 2019-05-27 | 4.0 |
| 273864 | 1 | 1 | 2019-06-03 | 2.0 |
| 274612 | 2 | 1 | 2019-06-10 | 1.0 |
#---------------------------- CREATING Product df -----------
# Total units sold per week across all points of sale.
weekly=vdf.groupby('week')['Units'].sum()
weekly.head(3)
weekly=weekly[1:-1] # removing the first and last date for lack of values (partial weeks)
weekly.head(3)
week 2016-01-04 1111641.0 2016-01-11 1052584.0 2016-01-18 1006466.0 Name: Units, dtype: float64
fig = px.line(x=weekly.index, y=weekly, labels={'x':'Date', 'y':'Units'})
fig.show()
#infer the frequency of the data (weekly, W-MON per the output below) and attach it to the index
forecast = weekly.asfreq(pd.infer_freq(weekly.index))
start_date = datetime(2016,1,1)
end_date = datetime(2019,11,1)
lim_df = forecast[start_date:end_date]
dates = lim_df.index  # NOTE: module-level global; later helpers reindex product/POS series against it
lim_df.head(3)
week 2016-01-04 1111641.0 2016-01-11 1052584.0 2016-01-18 1006466.0 Freq: W-MON, Name: Units, dtype: float64
We will check stationarity by performing an Augmented Dickey-Fuller test.
perform_adf_test(lim_df)
ADF Statistic: -2.736477 p-value: 0.067936
We will have to remove the trend. Basically, we take each data point and subtract the data point from the week that comes prior (first differencing).
first_diff = lim_df.diff()[1:]
first_diff.head(3)
week 2016-01-11 -59057.0 2016-01-18 -46118.0 2016-01-25 -11394.0 Freq: W-MON, Name: Units, dtype: float64
fig = px.line(x=first_diff.index, y=first_diff, labels={'x':'Date', 'y':'Value'})
fig.show()
Straightaway we can see that now the time series is distributed around 0. Now let's perform the ADF test to see.
perform_adf_test(first_diff)
ADF Statistic: -5.346396 p-value: 0.000004
We can now move on with the model.
acf_vals = acf(first_diff)
fig = px.bar(y=acf_vals)
fig.show()
There seems to be some repetition roughly every 4 lags (possibly monthly); however, it may not be strong enough to warrant seasonality.
pacf_vals = pacf(first_diff)
fig = px.bar(y=pacf_vals)
fig.show()
There seems to be some repetition every 3 to 4 lags (possibly monthly); however, it may not be strong enough to warrant seasonality.
# Hold out Sep-Oct 2019 as the test window; train on everything before it.
train_end = datetime(2019,8,30)
test_end = datetime(2019,11,1)
train_data = lim_df[:train_end]
test_data = lim_df[train_end + timedelta(days=1):test_end]
train_data.tail(3)
week 2019-08-12 878489.0 2019-08-19 916890.0 2019-08-26 912978.0 Freq: W-MON, Name: Units, dtype: float64
test_data.head(3)
week 2019-09-02 972914.0 2019-09-09 1024448.0 2019-09-16 1000358.0 Freq: W-MON, Name: Units, dtype: float64
my_order = (1,1,0) #(p,d,q) (AR,I,MA)
my_seasonal_order = (0, 1, 1, 52) # (P,D,Q,s): 52-week (yearly) seasonality on weekly data
# define model
model = SARIMAX(train_data, order=my_order, seasonal_order=my_seasonal_order)
#fit the model (timed: the 52-period seasonal term makes fitting relatively slow)
start = time()
model_fit = model.fit()
end = time()
print('Model Fitting Time:', end - start)
Model Fitting Time: 4.559351444244385
#summary of the model
print(model_fit.summary())
SARIMAX Results
============================================================================================
Dep. Variable: Units No. Observations: 191
Model: SARIMAX(1, 1, 0)x(0, 1, [1], 52) Log Likelihood -1744.378
Date: Wed, 02 Jun 2021 AIC 3494.756
Time: 21:29:03 BIC 3503.538
Sample: 01-04-2016 HQIC 3498.325
- 08-26-2019
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.4345 0.028 -15.543 0.000 -0.489 -0.380
ma.S.L52 -0.4147 0.051 -8.169 0.000 -0.514 -0.315
sigma2 5.577e+09 1.69e-12 3.31e+21 0.000 5.58e+09 5.58e+09
===================================================================================
Ljung-Box (L1) (Q): 7.71 Jarque-Bera (JB): 24.83
Prob(Q): 0.01 Prob(JB): 0.00
Heteroskedasticity (H): 0.81 Skew: -0.46
Prob(H) (two-sided): 0.49 Kurtosis: 4.87
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 6.68e+36. Standard errors may be unstable.
#get the predictions and residuals
# steps=len(test_data)+6 extends the forecast ~6 weeks past the test window (for plotting)
predictions = model_fit.forecast(steps=len(test_data)+6)
predictions_error = pd.Series(predictions, index=test_data.index)  # restrict to the test window
residuals = test_data - predictions_error
predictions
2019-09-02 9.768000e+05 2019-09-09 9.943063e+05 2019-09-16 9.939194e+05 2019-09-23 1.004472e+06 2019-09-30 9.747992e+05 2019-10-07 1.114921e+06 2019-10-14 1.042077e+06 2019-10-21 1.020082e+06 2019-10-28 9.654386e+05 2019-11-04 1.147454e+06 2019-11-11 1.095557e+06 2019-11-18 1.072562e+06 2019-11-25 1.048237e+06 2019-12-02 1.144891e+06 Freq: W-MON, Name: predicted_mean, dtype: float64
residuals
week 2019-09-02 -3886.003907 2019-09-09 30141.681097 2019-09-16 6438.587710 2019-09-23 14154.461446 2019-09-30 67986.827544 2019-10-07 68664.139430 2019-10-14 55397.070895 2019-10-21 48256.675012 Freq: W-MON, dtype: float64
fig = px.line(x=residuals.index, y=residuals, labels={'x':'Date', 'y':'Error'})
fig.show()
fig = go.Figure()
fig.add_trace(go.Scatter(x=lim_df.index, y=lim_df,
mode='lines',
name='TimeSeries'))
fig.add_trace(go.Scatter(x=predictions.index, y=predictions,
mode='lines',
name='Predictions'))
fig.update_layout(paper_bgcolor='rgba(255,255,255)', plot_bgcolor='rgba(0,0,0,0)')
fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
fig.update_yaxes(showline=True, linewidth=1, linecolor='black')
fig.update_xaxes(showgrid=True, gridwidth=0.1, gridcolor='lightskyblue')
#fig.update_yaxes(showgrid=True, gridwidth=0.1, gridcolor='lightskyblue')
fig.show()
print('Mean Absolute Percent Error:', round(np.mean(abs(residuals/test_data)),4))
Mean Absolute Percent Error: 0.0341
print('Root Mean Squared Error:', np.sqrt(np.mean(residuals**2)))
Root Mean Squared Error: 44581.30362955245
After testing the model over successive iterations, we observed that the further we go into the forecast horizon, the larger the errors tend to become.
TO NOTE: the first 5 forecasts are very accurate; beyond that point, the forecasts tend to get worse.
By using a 1-year seasonal lag in the forecast, we obtained significantly better results than with monthly lags.
# Walk-forward validation: for each test week, refit SARIMA on all data up
# to the previous week and take a single one-step-ahead forecast.
# (Indentation of the loop body restored — it was lost in the notebook export.)
rolling_predictions = test_data.copy()
for train_end in test_data.index:
    train_data = lim_df[:train_end - timedelta(days=1)]
    model = SARIMAX(train_data, order=my_order, seasonal_order=my_seasonal_order)
    model_fit = model.fit()
    pred = model_fit.forecast()
    rolling_predictions[train_end] = pred
rolling_residuals = test_data - rolling_predictions
rolling_residuals
week 2019-09-02 -3886.003907 2019-09-09 32361.418803 2019-09-16 -8860.728184 2019-09-23 -2541.748825 2019-09-30 57187.108832 2019-10-07 23966.871938 2019-10-14 -13015.738632 2019-10-21 -12850.580135 Freq: W-MON, Name: Units, dtype: float64
fig = px.line(x=rolling_residuals.index, y=rolling_residuals, labels={'x':'Date', 'y':'Error'})
fig.show()
fig = go.Figure()
fig.add_trace(go.Scatter(x=lim_df.index, y=lim_df,
mode='lines',
name='TimeSeries'))
fig.add_trace(go.Scatter(x=rolling_predictions.index, y=rolling_predictions,
mode='lines',
name='Predictions'))
fig.show()
print('Mean Absolute Percent Error:', round(np.mean(abs(rolling_residuals/test_data)),4))
Mean Absolute Percent Error: 0.0182
print('Root Mean Squared Error:', np.sqrt(np.mean(rolling_residuals**2)))
Root Mean Squared Error: 25803.72713705992
def forecast_by_POS(POS_ID):
    """Fit a SARIMA model on the weekly sales of one or more points of sale.

    Parameters
    ----------
    POS_ID : list of int
        ``Point-of-Sale_ID`` values; their weekly unit sales are summed.

    Returns
    -------
    tuple (pd.Series, pd.Series)
        (predictions, residuals), both indexed by test week (Sep-Oct 2019).

    Relies on the module-level ``vdf`` dataframe.
    """
    # Weekly total units for the selected points of sale.
    grouped = vdf[vdf['Point-of-Sale_ID'].isin(POS_ID)].groupby('week')['Units'].sum()
    # Drop the first and last (partial) weeks.
    weekly = grouped[1:-1]
    #-----------// FORECASTING //-----------#
    # Attach the inferred frequency (W-MON) so SARIMAX gets a proper index.
    forecast = weekly.asfreq(pd.infer_freq(weekly.index))
    # Restrict to the study window.
    start_date = datetime(2016,1,1)
    end_date = datetime(2019,11,1)
    lim_df = forecast[start_date:end_date]
    # Train through Aug 2019; test on Sep-Oct 2019.
    train_end = datetime(2019,8,30)
    test_end = datetime(2019,11,1)
    train_data = lim_df[:train_end]
    test_data = lim_df[train_end + timedelta(days=1):test_end]
    # SARIMA: d=1 removes the trend; the 52-week seasonal MA captures yearly seasonality.
    # (The explicit first-difference computation the original carried here was dead code.)
    my_order = (1,1,0)                 # (p,d,q)
    my_seasonal_order = (0, 1, 1, 52)  # (P,D,Q,s)
    model = SARIMAX(train_data, order=my_order, seasonal_order=my_seasonal_order)
    model_fit = model.fit()
    # Forecast exactly the test window and align to its index.
    predictions = model_fit.forecast(steps=len(test_data))
    predictions = pd.Series(predictions, index=test_data.index)
    residuals = test_data - predictions
    return predictions, residuals
def graph_pred(POS_ID):
    """Plot the weekly sales series for *POS_ID* together with a SARIMA
    forecast extending 6 weeks past the test window.

    NOTE(review): this function is redefined later in the file with a
    ``PID`` parameter; only the later definition is visible after that
    point. Consider renaming one of them.

    Relies on the module-level ``vdf`` dataframe.
    """
    # Weekly total units for the selected points of sale.
    grouped = vdf[vdf['Point-of-Sale_ID'].isin(POS_ID)].groupby('week')['Units'].sum()
    # Drop the first and last (partial) weeks.
    weekly = grouped[1:-1]
    # Attach the inferred frequency (W-MON).
    forecast = weekly.asfreq(pd.infer_freq(weekly.index))
    # Restrict to the study window.
    start_date = datetime(2016,1,1)
    end_date = datetime(2019,11,1)
    lim_df = forecast[start_date:end_date]
    # Train through Aug 2019; test on Sep-Oct 2019.
    train_end = datetime(2019,8,30)
    test_end = datetime(2019,11,1)
    train_data = lim_df[:train_end]
    test_data = lim_df[train_end + timedelta(days=1):test_end]
    # SARIMA with yearly (52-week) seasonality.
    my_order = (1,1,0)                 # (p,d,q)
    my_seasonal_order = (0, 1, 1, 52)  # (P,D,Q,s)
    model = SARIMAX(train_data, order=my_order, seasonal_order=my_seasonal_order)
    model_fit = model.fit()
    # Forecast the test window plus 6 extra weeks for the plot.
    # (The residuals computed here in the original were never used — removed.)
    predictions = model_fit.forecast(steps=len(test_data)+6)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=lim_df.index, y=lim_df,
                             mode='lines',
                             name='TimeSeries'))
    fig.add_trace(go.Scatter(x=predictions.index, y=predictions,
                             mode='lines',
                             name='Predictions'))
    fig.update_layout(paper_bgcolor='rgba(255,255,255)', plot_bgcolor='rgba(0,0,0,0)')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black')
    fig.update_xaxes(showgrid=True, gridwidth=0.1, gridcolor='lightskyblue')
    #fig.update_yaxes(showgrid=True, gridwidth=0.1, gridcolor='lightskyblue')
    return fig.show()
def graph_residuals(POS_ID):
    """Plot the test-window forecast residuals for points of sale *POS_ID*.

    Fits the same SARIMA model as ``forecast_by_POS`` and shows a line
    chart of (actual - predicted) over the Sep-Oct 2019 test weeks.
    Relies on the module-level ``vdf`` dataframe.
    """
    # Weekly total units for the selected points of sale.
    grouped = vdf[vdf['Point-of-Sale_ID'].isin(POS_ID)].groupby('week')['Units'].sum()
    # Drop the first and last (partial) weeks.
    weekly = grouped[1:-1]
    #-----------// FORECASTING //-----------#
    forecast = weekly.asfreq(pd.infer_freq(weekly.index))
    start_date = datetime(2016,1,1)
    end_date = datetime(2019,11,1)
    lim_df = forecast[start_date:end_date]
    # Train through Aug 2019; test on Sep-Oct 2019.
    # (The original also computed a first difference here that was never used.)
    train_end = datetime(2019,8,30)
    test_end = datetime(2019,11,1)
    train_data = lim_df[:train_end]
    test_data = lim_df[train_end + timedelta(days=1):test_end]
    # SARIMA with yearly (52-week) seasonality.
    my_order = (1,1,0)                 # (p,d,q)
    my_seasonal_order = (0, 1, 1, 52)  # (P,D,Q,s)
    model = SARIMAX(train_data, order=my_order, seasonal_order=my_seasonal_order)
    model_fit = model.fit()
    # Forecast the test window, align indexes, and plot the residuals.
    predictions = model_fit.forecast(steps=len(test_data))
    predictions = pd.Series(predictions, index=test_data.index)
    residuals = test_data - predictions
    fig = px.line(x=residuals.index, y=residuals, labels={'x':'Date', 'y':'Error'})
    return fig.show()
def indicators(POS_ID):
    """Return forecast error metrics for points of sale *POS_ID*.

    Fits the same SARIMA model as ``forecast_by_POS`` and evaluates it
    over the Sep-Oct 2019 test window.

    Returns
    -------
    tuple (float, float)
        (MAPE, RMSE). MAPE is a ratio (e.g. 0.03 = 3%), rounded to 4 dp.

    Relies on the module-level ``vdf`` dataframe.
    """
    # Weekly total units for the selected points of sale.
    grouped = vdf[vdf['Point-of-Sale_ID'].isin(POS_ID)].groupby('week')['Units'].sum()
    # Drop the first and last (partial) weeks.
    weekly = grouped[1:-1]
    #-----------// FORECASTING //-----------#
    forecast = weekly.asfreq(pd.infer_freq(weekly.index))
    start_date = datetime(2016,1,1)
    end_date = datetime(2019,11,1)
    lim_df = forecast[start_date:end_date]
    # Train through Aug 2019; test on Sep-Oct 2019.
    # (Removed the dead first-difference computation the original carried.)
    train_end = datetime(2019,8,30)
    test_end = datetime(2019,11,1)
    train_data = lim_df[:train_end]
    test_data = lim_df[train_end + timedelta(days=1):test_end]
    # SARIMA with yearly (52-week) seasonality.
    my_order = (1,1,0)                 # (p,d,q)
    my_seasonal_order = (0, 1, 1, 52)  # (P,D,Q,s)
    model = SARIMAX(train_data, order=my_order, seasonal_order=my_seasonal_order)
    model_fit = model.fit()
    # Forecast the test window and compute the error metrics.
    predictions = model_fit.forecast(steps=len(test_data))
    predictions = pd.Series(predictions, index=test_data.index)
    residuals = test_data - predictions
    MAPE = round(np.mean(abs(residuals/test_data)),4)  # undefined if a test week is 0 — assumes nonzero sales
    RMSE = np.sqrt(np.mean(residuals**2))
    return MAPE,RMSE
graph_pred(POS_ID=[292])
graph_residuals(POS_ID=[69])
indicators(POS_ID=[292])
(0.0321, 286.9648790752983)
def rolling_forecast_by_POS(POS_ID):
    """Walk-forward (rolling) one-step SARIMA forecast for points of sale.

    For every week in the Sep-Oct 2019 test window the model is refit on
    all data up to the previous week and a single one-step-ahead forecast
    is taken, mimicking a production re-forecasting cadence.

    Returns
    -------
    tuple (pd.Series, pd.Series)
        (rolling_predictions, rolling_residuals), indexed by test week.

    Relies on the module-level ``vdf`` dataframe.
    """
    # Weekly total units for the selected points of sale.
    grouped = vdf[vdf['Point-of-Sale_ID'].isin(POS_ID)].groupby('week')['Units'].sum()
    # Drop the first and last (partial) weeks.
    weekly = grouped[1:-1]
    #-----------// FORECASTING //-----------#
    forecast = weekly.asfreq(pd.infer_freq(weekly.index))
    start_date = datetime(2016,1,1)
    end_date = datetime(2019,11,1)
    lim_df = forecast[start_date:end_date]
    # Test on Sep-Oct 2019 (training data is rebuilt inside the loop).
    train_end = datetime(2019,8,30)
    test_end = datetime(2019,11,1)
    test_data = lim_df[train_end + timedelta(days=1):test_end]
    # SARIMA with yearly (52-week) seasonality.
    my_order = (1,1,0)                 # (p,d,q)
    my_seasonal_order = (0, 1, 1, 52)  # (P,D,Q,s)
    # NOTE: the original fit a model here and immediately discarded it,
    # and also computed an unused first difference — both removed.
    rolling_predictions = test_data.copy()
    for train_end in test_data.index:
        # Refit on everything strictly before this test week.
        train_data = lim_df[:train_end-timedelta(days=1)]
        model = SARIMAX(train_data, order=my_order, seasonal_order=my_seasonal_order)
        model_fit = model.fit()
        pred = model_fit.forecast()  # one-step-ahead forecast
        rolling_predictions[train_end] = pred
    rolling_residuals = test_data - rolling_predictions
    return rolling_predictions,rolling_residuals
def rolling_graph_pred(POS_ID):
    """Plot the weekly sales series for *POS_ID* together with the
    walk-forward (one-step-ahead, refit weekly) SARIMA predictions.

    Relies on the module-level ``vdf`` dataframe.
    """
    # Weekly total units for the selected points of sale.
    grouped = vdf[vdf['Point-of-Sale_ID'].isin(POS_ID)].groupby('week')['Units'].sum()
    # Drop the first and last (partial) weeks.
    weekly = grouped[1:-1]
    #-----------// FORECASTING //-----------#
    forecast = weekly.asfreq(pd.infer_freq(weekly.index))
    start_date = datetime(2016,1,1)
    end_date = datetime(2019,11,1)
    lim_df = forecast[start_date:end_date]
    # Test on Sep-Oct 2019 (training data is rebuilt inside the loop).
    train_end = datetime(2019,8,30)
    test_end = datetime(2019,11,1)
    test_data = lim_df[train_end + timedelta(days=1):test_end]
    # SARIMA with yearly (52-week) seasonality.
    my_order = (1,1,0)                 # (p,d,q)
    my_seasonal_order = (0, 1, 1, 52)  # (P,D,Q,s)
    # NOTE: removed the original's discarded pre-loop model fit, the dead
    # first-difference computation, and the unused residuals.
    rolling_predictions = test_data.copy()
    for train_end in test_data.index:
        # Refit on everything strictly before this test week.
        train_data = lim_df[:train_end-timedelta(days=1)]
        model = SARIMAX(train_data, order=my_order, seasonal_order=my_seasonal_order)
        model_fit = model.fit()
        pred = model_fit.forecast()  # one-step-ahead forecast
        rolling_predictions[train_end] = pred
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=lim_df.index, y=lim_df,
                             mode='lines',
                             name='TimeSeries'))
    fig.add_trace(go.Scatter(x=rolling_predictions.index, y=rolling_predictions,
                             mode='lines',
                             name='Predictions'))
    return fig.show()
def rolling_graph_residuals(POS_ID):
    """Plot the walk-forward (one-step-ahead) forecast residuals for
    points of sale *POS_ID* over the Sep-Oct 2019 test window.

    Relies on the module-level ``vdf`` dataframe.
    """
    # Weekly total units for the selected points of sale.
    grouped = vdf[vdf['Point-of-Sale_ID'].isin(POS_ID)].groupby('week')['Units'].sum()
    # Drop the first and last (partial) weeks.
    weekly = grouped[1:-1]
    #-----------// FORECASTING //-----------#
    forecast = weekly.asfreq(pd.infer_freq(weekly.index))
    start_date = datetime(2016,1,1)
    end_date = datetime(2019,11,1)
    lim_df = forecast[start_date:end_date]
    # Test on Sep-Oct 2019 (training data is rebuilt inside the loop).
    train_end = datetime(2019,8,30)
    test_end = datetime(2019,11,1)
    test_data = lim_df[train_end + timedelta(days=1):test_end]
    # SARIMA with yearly (52-week) seasonality.
    my_order = (1,1,0)                 # (p,d,q)
    my_seasonal_order = (0, 1, 1, 52)  # (P,D,Q,s)
    # NOTE: removed the original's discarded pre-loop model fit and the
    # dead first-difference computation.
    rolling_predictions = test_data.copy()
    for train_end in test_data.index:
        # Refit on everything strictly before this test week.
        train_data = lim_df[:train_end-timedelta(days=1)]
        model = SARIMAX(train_data, order=my_order, seasonal_order=my_seasonal_order)
        model_fit = model.fit()
        pred = model_fit.forecast()  # one-step-ahead forecast
        rolling_predictions[train_end] = pred
    rolling_residuals = test_data - rolling_predictions
    fig = px.line(x=rolling_residuals.index, y=rolling_residuals, labels={'x':'Date', 'y':'Error'})
    return fig.show()
def rolling_indicators(POS_ID):
    """Return error metrics for the walk-forward (one-step-ahead) SARIMA
    forecast over the Sep-Oct 2019 test window.

    Returns
    -------
    tuple (float, float)
        (MAPE, RMSE). MAPE is a ratio (e.g. 0.02 = 2%), rounded to 4 dp.

    Relies on the module-level ``vdf`` dataframe.
    """
    # Weekly total units for the selected points of sale.
    grouped = vdf[vdf['Point-of-Sale_ID'].isin(POS_ID)].groupby('week')['Units'].sum()
    # Drop the first and last (partial) weeks.
    weekly = grouped[1:-1]
    #-----------// FORECASTING //-----------#
    forecast = weekly.asfreq(pd.infer_freq(weekly.index))
    start_date = datetime(2016,1,1)
    end_date = datetime(2019,11,1)
    lim_df = forecast[start_date:end_date]
    # Test on Sep-Oct 2019 (training data is rebuilt inside the loop).
    train_end = datetime(2019,8,30)
    test_end = datetime(2019,11,1)
    test_data = lim_df[train_end + timedelta(days=1):test_end]
    # SARIMA with yearly (52-week) seasonality.
    my_order = (1,1,0)                 # (p,d,q)
    my_seasonal_order = (0, 1, 1, 52)  # (P,D,Q,s)
    # NOTE: removed the original's discarded pre-loop model fit and the
    # dead first-difference computation.
    rolling_predictions = test_data.copy()
    for train_end in test_data.index:
        # Refit on everything strictly before this test week.
        train_data = lim_df[:train_end-timedelta(days=1)]
        model = SARIMAX(train_data, order=my_order, seasonal_order=my_seasonal_order)
        model_fit = model.fit()
        pred = model_fit.forecast()  # one-step-ahead forecast
        rolling_predictions[train_end] = pred
    rolling_residuals = test_data - rolling_predictions
    MAPE = round(np.mean(abs(rolling_residuals/test_data)),4)  # assumes nonzero test-week sales
    RMSE = np.sqrt(np.mean(rolling_residuals**2))
    return MAPE,RMSE
#rolling_graph_pred(POS_ID=[69])
#rolling_graph_residuals(POS_ID=[69])
#rolling_indicators(POS_ID=[69])
def forecast_kpi_by_product(PID):
    """Return forecast error metrics for products *PID*.

    Returns
    -------
    tuple (float, float)
        (MAPE, RMSE) over the Sep-Oct 2019 test window, rounded to 4 dp.

    Relies on the module-level ``pdf`` dataframe and ``dates`` index.
    FIX(review): the original forecast ``len(test_data)+6`` steps and
    subtracted the unaligned result, producing NaN residuals for the 6
    extra weeks; KPIs only concern the test window, so forecast exactly
    that and align the index (metrics are unchanged since pandas skips
    the NaNs, but the extra fitting work and NaNs are gone).
    """
    # Weekly total units for the selected products.
    grouped = pdf[pdf['ProductName_ID'].isin(PID)].groupby('week')['Units'].sum()
    # Reindex to the full weekly calendar; weeks with no sales become 0.
    grouped = pd.Series(grouped, index=dates).fillna(0)
    # Drop the first and last (partial) weeks.
    weekly = grouped[1:-1]
    #-----------// FORECASTING //-----------#
    forecast = weekly.asfreq(pd.infer_freq(weekly.index))
    start_date = datetime(2016,1,1)
    end_date = datetime(2019,11,1)
    lim_df = forecast[start_date:end_date]
    # Train through Aug 2019; test on Sep-Oct 2019.
    train_end = datetime(2019,8,30)
    test_end = datetime(2019,11,1)
    train_data = lim_df[:train_end]
    test_data = lim_df[train_end + timedelta(days=1):test_end]
    # Random-walk-with-seasonality SARIMA (no AR/MA terms at the weekly level).
    my_order = (0,1,0)                 # (p,d,q)
    my_seasonal_order = (0, 1, 1, 52)  # (P,D,Q,s)
    model = SARIMAX(train_data, order=my_order, seasonal_order=my_seasonal_order)
    model_fit = model.fit()
    # Forecast exactly the test window and align indexes.
    predictions = model_fit.forecast(steps=len(test_data))
    predictions = pd.Series(predictions, index=test_data.index)
    residuals = test_data - predictions
    MAPE = round(np.mean(abs(residuals/test_data)),4)  # undefined if a test week is 0
    RMSE = round(np.sqrt(np.mean(residuals**2)),4)
    return MAPE,RMSE
def graph_pred(PID):
    """Plot weekly unit sales for products *PID* together with a SARIMA
    forecast extending 6 weeks past the test window.

    NOTE(review): this redefines the earlier ``graph_pred(POS_ID)``; only
    this definition is visible from here on. Consider renaming.
    Relies on the module-level ``pdf`` dataframe and ``dates`` index.
    """
    # Weekly total units for the selected products.
    grouped = pdf[pdf['ProductName_ID'].isin(PID)].groupby('week')['Units'].sum()
    # Reindex to the full weekly calendar; weeks with no sales become 0.
    grouped = pd.Series(grouped, index=dates).fillna(0)
    # Drop the first and last (partial) weeks.
    weekly = grouped[1:-1]
    forecast = weekly.asfreq(pd.infer_freq(weekly.index))
    start_date = datetime(2016,1,1)
    end_date = datetime(2019,11,1)
    lim_df = forecast[start_date:end_date]
    # Train through Aug 2019; test on Sep-Oct 2019.
    train_end = datetime(2019,8,30)
    test_end = datetime(2019,11,1)
    train_data = lim_df[:train_end]
    test_data = lim_df[train_end + timedelta(days=1):test_end]
    # SETTING PREDICTIONS FOR ALMOST DISCONTINUED ITEMS TO 0:
    # if the product barely sold in 2019, skip SARIMA and predict zero.
    # FIX(review): replaced the original isna()/replace construct, which
    # left True (plotted as 1) at any NaN test week, with an explicit
    # zero series.
    if train_data[datetime(2019,1,1):].mean() < 1:
        predictions = pd.Series(0, index=test_data.index)
    else:
        # SARIMA with yearly (52-week) seasonality.
        my_order = (1,1,0)                 # (p,d,q)
        my_seasonal_order = (0, 1, 1, 52)  # (P,D,Q,s)
        model = SARIMAX(train_data, order=my_order, seasonal_order=my_seasonal_order)
        model_fit = model.fit()
        # Forecast the test window plus 6 extra weeks for the plot.
        # (Removed the original's unused residuals computation.)
        predictions = model_fit.forecast(steps=len(test_data)+6)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=lim_df.index, y=lim_df,
                             mode='lines',
                             name='TimeSeries'))
    fig.add_trace(go.Scatter(x=predictions.index, y=predictions,
                             mode='lines',
                             name='Predictions'))
    fig.update_layout(paper_bgcolor='rgba(255,255,255)', plot_bgcolor='rgba(0,0,0,0)')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black')
    fig.update_xaxes(showgrid=True, gridwidth=0.1, gridcolor='lightskyblue')
    #fig.update_yaxes(showgrid=True, gridwidth=0.1, gridcolor='lightskyblue')
    return fig.show()
graph_pred(PID=[2609])
forecast_kpi_by_product(PID=[2609])
(0.0446, 1016.5463)
def forecast_by_POS_and_PID(POS,PID):
    """Plot weekly unit sales for the (POS, PID) combination together
    with a SARIMA forecast extending 6 weeks past the test window.

    Relies on the module-level ``fdf`` dataframe and ``dates`` index.
    FIX(review): the original used chained boolean indexing
    ``fdf[m1][m2]`` with a mask built from the full frame, which pandas
    only accepts by silently reindexing the key; both filters are now
    combined with ``&`` in a single selection.
    """
    # Weekly total units for the selected POS x product combination.
    mask = fdf['ProductName_ID'].isin(PID) & fdf['Point-of-Sale_ID'].isin(POS)
    grouped = fdf[mask].groupby('week')['Units'].sum()
    # Reindex to the full weekly calendar; weeks with no sales become 0.
    grouped = pd.Series(grouped, index=dates).fillna(0)
    # Drop the first and last (partial) weeks.
    weekly = grouped[1:-1]
    forecast = weekly.asfreq(pd.infer_freq(weekly.index))
    start_date = datetime(2016,1,1)
    end_date = datetime(2019,11,1)
    lim_df = forecast[start_date:end_date]
    # Train through Aug 2019; test on Sep-Oct 2019.
    train_end = datetime(2019,8,30)
    test_end = datetime(2019,11,1)
    train_data = lim_df[:train_end]
    test_data = lim_df[train_end + timedelta(days=1):test_end]
    # SETTING PREDICTIONS FOR ALMOST DISCONTINUED ITEMS TO 0:
    # near-zero 2019 sales -> skip SARIMA and predict a flat zero.
    if train_data[datetime(2019,1,1):].mean() < 1:
        predictions = pd.Series(0, index=test_data.index)
    else:
        # SARIMA with yearly (52-week) seasonality.
        my_order = (1,1,0)                 # (p,d,q)
        my_seasonal_order = (0, 1, 1, 52)  # (P,D,Q,s)
        model = SARIMAX(train_data, order=my_order, seasonal_order=my_seasonal_order)
        model_fit = model.fit()
        # Forecast the test window plus 6 extra weeks for the plot.
        # (Removed the original's unused predictions_for_measures/residuals.)
        predictions = model_fit.forecast(steps=len(test_data)+6)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=lim_df.index, y=lim_df,
                             mode='lines',
                             name='TimeSeries'))
    fig.add_trace(go.Scatter(x=predictions.index, y=predictions,
                             mode='lines',
                             name='Predictions'))
    fig.update_layout(paper_bgcolor='rgba(255,255,255)', plot_bgcolor='rgba(0,0,0,0)')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black')
    fig.update_xaxes(showgrid=True, gridwidth=0.1, gridcolor='lightskyblue')
    #fig.update_yaxes(showgrid=True, gridwidth=0.1, gridcolor='lightskyblue')
    return fig.show()
def kpi_by_POS_and_PID(PID,POS):
    """Return forecast error metrics for the (POS, PID) combination.

    Returns
    -------
    tuple (float, float)
        (MAPE, RMSE) over the Sep-Oct 2019 test window, rounded to 4 dp.

    Relies on the module-level ``fdf`` dataframe and ``dates`` index.
    FIX(review): combined the two boolean filters with ``&`` instead of
    chained ``fdf[m1][m2]`` indexing (which reindexes the key), and used
    the index-aligned prediction series for the residuals instead of
    leaving it as an unused variable.
    """
    # Weekly total units for the selected POS x product combination.
    mask = fdf['ProductName_ID'].isin(PID) & fdf['Point-of-Sale_ID'].isin(POS)
    grouped = fdf[mask].groupby('week')['Units'].sum()
    # Reindex to the full weekly calendar; weeks with no sales become 0.
    grouped = pd.Series(grouped, index=dates).fillna(0)
    # Drop the first and last (partial) weeks.
    weekly = grouped[1:-1]
    forecast = weekly.asfreq(pd.infer_freq(weekly.index))
    start_date = datetime(2016,1,1)
    end_date = datetime(2019,11,1)
    lim_df = forecast[start_date:end_date]
    # Train through Aug 2019; test on Sep-Oct 2019.
    train_end = datetime(2019,8,30)
    test_end = datetime(2019,11,1)
    train_data = lim_df[:train_end]
    test_data = lim_df[train_end + timedelta(days=1):test_end]
    # SETTING PREDICTIONS FOR ALMOST DISCONTINUED ITEMS TO 0:
    # near-zero 2019 sales -> skip SARIMA and predict a flat zero.
    if train_data[datetime(2019,1,1):].mean() < 1:
        predictions = pd.Series(0, index=test_data.index)
    else:
        # SARIMA with yearly (52-week) seasonality.
        my_order = (1,1,0)                 # (p,d,q)
        my_seasonal_order = (0, 1, 1, 52)  # (P,D,Q,s)
        model = SARIMAX(train_data, order=my_order, seasonal_order=my_seasonal_order)
        model_fit = model.fit()
        # Forecast exactly the test window and align indexes.
        predictions = model_fit.forecast(steps=len(test_data))
        predictions = pd.Series(predictions, index=test_data.index)
    residuals = test_data - predictions
    MAPE = round(np.mean(abs(residuals/test_data)),4)  # undefined if a test week is 0
    RMSE = round(np.sqrt(np.mean(residuals**2)),4)
    return MAPE,RMSE
forecast_by_POS_and_PID(POS=[62,72,359,48,282,383,103, 92,272,280,78,292],PID=[1277])
kpi_by_POS_and_PID(POS=[62,72,359,48,282,383,103, 92,272,280,78,292],PID=[1277])
(0.0411, 71.1136)